!! Copyright (C) 2009,2010,2011,2012  Marco Restelli
!!
!! This file is part of:
!!   LDGH -- Local Hybridizable Discontinuous Galerkin toolkit
!!
!! LDGH is free software: you can redistribute it and/or modify it
!! under the terms of the GNU General Public License as published by
!! the Free Software Foundation, either version 3 of the License, or
!! (at your option) any later version.
!!
!! LDGH is distributed in the hope that it will be useful, but WITHOUT
!! ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
!! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
!! License for more details.
!!
!! You should have received a copy of the GNU General Public License
!! along with LDGH. If not, see <http://www.gnu.org/licenses/>.
!!
!! author: Marco Restelli                   <marco.restelli@gmail.com>


!>\brief
!! In this module the local matrix and right hand side for the CG
!! method are computed for the Stokes/Navier-Stokes problem.
!!
!! \n
!!
!! The Navier-Stokes problem is written as
!! \f{displaymath}{
!!  \begin{array}{rcll}
!!  -\nabla\cdot\left[\mu
!!    \left(\nabla\underline{u}+(\nabla\underline{u})^T\right)\right]
!!    + \underline{u}\cdot\nabla\underline{u}
!!    + \nabla p &=& \underline{f} & {\rm
!!    in}\,\Omega \\\
!!  -\nabla\cdot\underline{u} & = & 0 & {\rm in}\,\Omega \\\
!!  \underline{u}&=&\underline{u}_D & {\rm on}\,\partial\Omega_D \\\
!!  \left(\mu\left(\nabla\underline{u}+(\nabla\underline{u})^T\right)
!!    -p\right)\cdot\underline{n} &=& \underline{h} & 
!!      {\rm on}\,\partial\Omega_N.
!!  \end{array}
!! \f}
!! This form is equivalent to the usual one
!! \f{displaymath}{
!!  \begin{array}{rcll}
!!  -\mu\Delta\underline{u}
!!    + \underline{u}\cdot\nabla\underline{u}
!!    + \nabla p &=& \underline{f} & {\rm
!!    in}\,\Omega \\\
!!  -\nabla\cdot\underline{u} & = & 0 & {\rm in}\,\Omega
!!  \end{array}
!! \f}
!! if \f$\mu=const\f$, however when \f$\mu\neq const\f$ only the first
!! form is consistent with the axioms of continuous mechanics. It
!! should be mentioned that the first form yields a denser matrix than
!! the second.
!! 
!! The local matrices do not include any static condensation; they
!! include the Neumann type boundary conditions but not the Dirichlet
!! type ones.
!!
!! This module implements the following cases (in any possible
!! combination):
!! <ul>
!!  <li> steady state vs. time evolution
!!  <li> implicit advection vs. explicit advection
!!  <li> centered space discretization vs. N-scheme/PSI upwinding
!!  (only for first order velocity) and gradient jump stabilization
!!  <li> stable mixed finite elements vs. pressure stabilized finite
!!  elements.
!! </ul>
!! A detailed list of the cases that must be considered in each
!! function is as follows:
!! <ul>
!!  <li> \c cg_ns_locmat:
!!   <ul>
!!    <li> implicit advection vs. explicit advection
!!    <li> centered space discretization vs. N-scheme/PSI upwinding
!!   </ul>
!!   the computed matrices can then be used to build either steady
!!   state or time evolution problems;
!!  <li> \c cg_ns_locpstab:
!!   <ul>
!!    <li> no adjustments required;
!!   </ul>
!!  <li> \c cg_ns_locustab:
!!   <ul>
!!    <li> no adjustments required;
!!   </ul>
!!  <li> \c cg_ns_locmassmat:
!!   <ul>
!!    <li> no adjustments required;
!!   </ul>
!!  <li> \c cg_ns_locrhs:
!!   <ul>
!!    <li> centered space discretization vs. N-scheme/PSI/side based
!!    upwinding/stabilization
!!   </ul>
!!   the computed right hand side assumes a time dependent problem
!!   with explicit time discretization, since the steady state and the
!!   implicit cases can be constructed starting from \c cg_ns_locmat.
!! </ul>
!! The choice of the upwinding scheme is done by the module variable
!! \c upwinding which is set by the module constructor.
!!
!! \note There are no consistency checks on the coefficients returned
!! by the \c coeff* functions of \c mod_testcases, which thus have to
!! be correct.
!<----------------------------------------------------------------------
module mod_cg_ns_locmat

!-----------------------------------------------------------------------

 use mod_messages, only: &
   mod_messages_initialized, &
   error,   &
   warning, &
   info

 use mod_kinds, only: &
   mod_kinds_initialized, &
   wp

 use mod_linal, only: &
   mod_linal_initialized, &
   linsys

 use mod_sympoly, only: &
   mod_sympoly_initialized, &
   operator(*),  &
   me_int, var_change

 use mod_master_el, only: &
   mod_master_el_initialized

 use mod_base, only: &
   mod_base_initialized, &
   t_base

 use mod_grid, only: &
   mod_grid_initialized, &
   t_e, t_s, affmap, &
   diameter

 use mod_bcs, only: &
   mod_bcs_initialized, &
   p_t_b_e, b_dir, b_neu, b_ddc
 
 use mod_testcases, only: &
   mod_testcases_initialized, &
   coeff_visc,  &
   coeff_f,     &
   coeff_neu

!-----------------------------------------------------------------------
 
 implicit none

!-----------------------------------------------------------------------

! Module interface

 public :: &
   mod_cg_ns_locmat_constructor, &
   mod_cg_ns_locmat_destructor,  &
   mod_cg_ns_locmat_initialized, &
   cg_ns_locmat,     &
   cg_ns_locpstab,   &
   cg_ns_locustab,   &
   cg_ns_locmassmat, &
   cg_ns_locrhs,     &
   t_bcs_error

 private

!-----------------------------------------------------------------------

! Module types and parameters

 ! This type is used to indicate that an error has occurred in the
 ! computation of the boundary condition
 type t_bcs_error
   logical :: lerr
   character(len=1000) :: message
 end type t_bcs_error

! Module variables

 ! public members
 logical, protected ::               &
   mod_cg_ns_locmat_initialized = .false.
 ! private members
 integer, parameter :: & ! upwinding mechanisms
   upw_none    = 0, &
   upw_nscheme = 1, &
   upw_psi     = 2, &
   upw_side_sc = 3, &
   upw_elem    = 4
 integer :: upwinding
 logical :: asym_adv

 real(wp), allocatable :: phitens(:,:,:,:,:), phi_q(:,:,:), &
   qtens(:,:,:,:), int_q(:), psi(:,:,:,:), psib(:,:,:,:),   &
   omega(:,:,:,:,:,:), csi(:,:)
 real(wp) :: u_stab_coeff ! used only for side based adv. stabilization
 real(wp) :: p_stab_coeff ! used only for pressure stabilization
 character(len=*), parameter :: &
   this_mod_name = 'mod_cg_ns_locmat'

!-----------------------------------------------------------------------

contains

!-----------------------------------------------------------------------

 subroutine mod_cg_ns_locmat_constructor(base,pbase,pstab, &
              antisym_adv,upw,ustab)
  type(t_base), intent(in) :: base, pbase
  real(wp) :: pstab
  logical, intent(in), optional :: antisym_adv
  !> this variable can be set as follows:
  !! <ul>
  !!  <li> <tt>"none"</tt> or absent: centered method
  !!  <li> <tt>"N-scheme"</tt> : N-scheme upwinding
  !!  <li> <tt>"PSI"</tt> : PSI upwinding
  !!  <li> <tt>"side_sc"</tt> : side based, strongly consistent
  !!  <li> <tt>"elem-penalty"</tt> : element penalization
  !! </ul>
  character(len=*), intent(in), optional :: upw
  real(wp), optional :: ustab

  integer :: j, i, l, h, k, m
  real(wp) :: xv0(base%me%d), bhat(base%me%d,base%me%d-1)
  character(len=1000) :: message
  character(len=*), parameter :: &
    this_sub_name = 'constructor'

   !Consistency checks ---------------------------
   if( (mod_messages_initialized.eqv..false.) .or. &
       (mod_kinds_initialized.eqv..false.)    .or. &
       (mod_linal_initialized.eqv..false.)    .or. &
       (mod_sympoly_initialized.eqv..false.)  .or. &
       (mod_master_el_initialized.eqv..false.).or. &
       (mod_base_initialized.eqv..false.)     .or. &
       (mod_grid_initialized.eqv..false.)     .or. &
       (mod_bcs_initialized.eqv..false.)      .or. &
       (mod_testcases_initialized.eqv..false.) ) then
     call error(this_sub_name,this_mod_name, &
                'Not all the required modules are initialized.')
   endif
   if(mod_cg_ns_locmat_initialized.eqv..true.) then
     call warning(this_sub_name,this_mod_name, &
                  'Module is already initialized.')
   endif
   !----------------------------------------------

   asym_adv = .false.
   if(present(antisym_adv)) asym_adv = antisym_adv

   if(present(upw)) then
     upw_case: select case(trim(upw))
      case("none")
       upwinding = upw_none
      case("N-scheme")
       if(base%k.ne.1) call error(this_sub_name,this_mod_name, &
        'The N-scheme upwinding requires first order velocity.')
       upwinding = upw_nscheme
      case("PSI")
       if(base%k.ne.1) call error(this_sub_name,this_mod_name, &
        'The PSI upwinding requires first order velocity.')
       upwinding = upw_psi
      case("side_sc")
       if(.not.present(ustab)) call error(this_sub_name,&
        this_mod_name, &
        'The advection stabilization coefficient is missing.')
       u_stab_coeff = ustab
       upwinding = upw_side_sc
      case("elem-penalty")
       if(.not.present(ustab)) call error(this_sub_name,&
        this_mod_name, &
        'The advection stabilization coefficient is missing.')
       u_stab_coeff = ustab
       upwinding = upw_elem
      case default
       write(message,'(A,A,A)') 'Unknown upwinding "',trim(upw),'".'
       call error(this_sub_name,this_mod_name,message)
     end select upw_case
   else
     upwinding = upw_none
   endif

   allocate(phitens(base%me%d,base%me%d,base%m,base%pk,base%pk))
   do j=1,base%pk
     do i=1,base%pk
       do l=1,base%m
         do h=1,base%me%d
           do k=1,base%me%d
             phitens(h,k,l,i,j) = base%gradp(h,j,l) * base%gradp(k,i,l)
           enddo
         enddo
       enddo
     enddo
   enddo

   allocate(phi_q(base%me%d,pbase%pk,base%pk))
   phi_q = 0.0_wp
   do j=1,base%pk
     do i=1,pbase%pk
       do l=1,base%m
         phi_q(:,i,j) = phi_q(:,i,j) &
           - base%wg(l)*base%gradp(:,j,l)*pbase%p(i,l)
       enddo
     enddo
   enddo

   ! qtens is analogous to phitens, with two differences: it uses the
   ! pressure basis functions, and it includes quadrature on the
   ! reference element (and thus has one dimension less)
   allocate(qtens(base%me%d,base%me%d,pbase%pk,pbase%pk))
   do j=1,pbase%pk
     do i=1,pbase%pk
       !do l=1,base%m
       do h=1,pbase%me%d
         do k=1,pbase%me%d
           qtens(h,k,i,j) = me_int( pbase%me%d ,     &
             pbase%gradp_s(h,i) * pbase%gradp_s(k,j) )
         enddo
       enddo
     enddo
   enddo


   allocate(int_q(pbase%pk))
   int_q = 0.0_wp
   do l=1,base%m
     int_q = int_q + base%wg(l)*pbase%p(:,l)
   enddo

   ! Careful: psi is stored with indices in reversed order: k,j,i
   ! (this doesn't really matter since psi is symmetric in the first
   ! and last indices, but is speeds up the computations).
   allocate(psi(base%me%d,base%pk,base%pk,base%pk))
   do i=1,base%pk
     do j=1,base%pk
       do k=1,i ! should be base%pk, we exploit symmetry in i,k
         do h=1,base%me%d
           psi(h,k,j,i) = me_int( base%me%d ,              &
             base%p_s(i) * base%gradp_s(h,j) * base%p_s(k) )
           if(k.ne.i) psi(h,i,j,k) = psi(h,k,j,i) ! symmetry
         enddo
       enddo
     enddo
   enddo

   ! Here the difficulty is that we need to integrate the traces of
   ! the basis functions of the sides of the reference element. This
   ! requires a change of variables, from the d-dimensional variable x
   ! to the (d-1)-variable xi. We handle this as it is done in
   ! mod_fe_spaces, in function rt_vect_canonical_dofs.
   if(asym_adv) then
     allocate(psib(base%pk,base%pk,base%pk,base%me%d+1))
     do h=1,base%me%d+1 ! side loop
       xv0 = base%me%xv(:,base%me%isv(1,h)) ! vertex 0 of side h
       do k=1,base%me%d-1 ! vertex k of side h
         bhat(:,k) = base%me%xv(:,base%me%isv(k+1,h)) - xv0
       enddo
       ! The integrals are computed on the (d-1)-dimensional ref. el.
       do i=1,base%pk
         do j=1,i
           do k=1,j
             psib(k,j,i,h) = me_int( base%me%d-1 ,                   &
               var_change( base%p_s(k) * base%p_s(j) * base%p_s(i) , &
               bhat , xv0 ) )
             ! fill all the symmetric terms: check all the possible
             ! permutations of k,j,i (some of them may coincide)
             psib(k,i,j,h) = psib(k,j,i,h)
             psib(j,k,i,h) = psib(k,j,i,h)
             psib(j,i,k,h) = psib(k,j,i,h)
             psib(i,k,j,h) = psib(k,j,i,h)
             psib(i,j,k,h) = psib(k,j,i,h)
           enddo
         enddo
       enddo
     enddo
   endif

   ! To spped-up the computations, the indexes of omega are ordered as
   ! follows: (p,q,k,h,i,j), where p,q are the spatial dimension
   ! indexes of the rank two tensor omega(:,:,k,h,i,j).
   if(upwinding.eq.upw_elem) then
   allocate(omega(base%me%d,base%me%d,base%pk,base%pk,base%pk,base%pk))
     do i=1,base%pk
       do j=1,i
         do k=1,base%pk
           do h=1,k
             do l=1,base%me%d
               do m=1,base%me%d
                 omega(l,m,k,h,i,j) = me_int( base%me%d ,             &
    base%p_s(k) * base%p_s(h) * base%gradp_s(l,j) * base%gradp_s(m,i) )
               enddo ! m
             enddo ! l
             if(h.ne.k) omega(:,:,h,k,i,j) = omega(:,:,k,h,i,j)
           enddo ! h
         enddo ! k
         if(j.ne.i) then
           do k=1,base%pk
             do h=1,base%pk
               omega(:,:,h,k,j,i) = transpose(omega(:,:,h,k,i,j))
             enddo
           enddo
         endif
       enddo ! j
     enddo ! i
   endif

   allocate(csi(base%pk,base%pk))
   do j=1,base%pk
     do i=1,j
       csi(i,j) = me_int( base%me%d , base%p_s(i)*base%p_s(j) )
       csi(j,i) = csi(i,j)
     enddo
   enddo

   p_stab_coeff = pstab

   mod_cg_ns_locmat_initialized = .true.
 end subroutine mod_cg_ns_locmat_constructor

!-----------------------------------------------------------------------
 
 subroutine mod_cg_ns_locmat_destructor()
  character(len=*), parameter :: &
    this_sub_name = 'destructor'
   
   !Consistency checks ---------------------------
   if(mod_cg_ns_locmat_initialized.eqv..false.) then
     call error(this_sub_name,this_mod_name, &
                'This module is not initialized.')
   endif
   !----------------------------------------------

   deallocate(phitens,phi_q,qtens,int_q,psi,csi)
   if(asym_adv)              deallocate(psib)
   if(upwinding.eq.upw_elem) deallocate(omega)

   mod_cg_ns_locmat_initialized = .false.
 end subroutine mod_cg_ns_locmat_destructor

!-----------------------------------------------------------------------
 
 !> Computation of the local matrices and right hand side.
 !!
 !! The local matrices are
 !! \f{displaymath}{
 !!  A^K_{ij} = \int_K
 !!  \mu\left(\nabla\underline{\varphi}_j+
 !!  (\nabla\underline{\varphi}_j)^T\right) :
 !!  \nabla\underline{\varphi}_i \,dx, + D^K_{ij}(\underline{u}) \qquad
 !!  B^K_{ij} = -\int_K
 !!  \nabla\cdot\underline{\varphi}_j\, q_i \, dx
 !! \f}
 !! and the right hand side is
 !! \f{displaymath}{
 !!  f^K_{i} = \int_K \underline{f}\cdot\underline{\varphi}_i\,dx + 
 !!    \int_{\partial K\cap\partial\Omega_N}\underline{h}\cdot
 !!    \underline{\varphi}_i\,d\sigma.
 !! \f}
 !! There are also the local matrixes
 !! \f{displaymath}{
 !!  C^K_{ij} = - \tau^K \int_K \nabla q_j \cdot \nabla q_i \, dx
 !! \f}
 !! with \f$\tau^K = Ch_K^2\f$, which provides an element based
 !! pressure stabilization, and 
 !! \f{displaymath}{
 !!  \Lambda^K_i = \int_K q_i \, dx
 !! \f}
 !! which is associated with the Lagrange multiplier \f$\lambda\f$
 !! used to enforce the zero mean pressure condition. Such local
 !! matrixes are computed only if <tt>eb_p_stab.eqv..true.</tt> and
 !! <tt>div_mul.eqv..true.</tt>, respectively.
 !!
 !! The nonlinear advection term \f$D^K(\underline{u})\f$ is included
 !! only if the optional input argument \c uk is present, and, for
 !! centered advection, it is computed as
 !! \f{displaymath}{
 !!  D^K_{ij}(\underline{u}) = \sum_k u^K_k \int_K 
 !!  \left(\nabla\underline{\varphi}_j\,\underline{\varphi}_k\right)
 !!  \cdot \underline{\varphi}_i \,dx.
 !! \f}
 !! An alternative form for the advection term is also available:
 !! \f{displaymath}{
 !!  \widetilde{D}^K_{ij}(\underline{u}) = \frac{1}{2}\sum_k u^K_k 
 !!  \left[ \int_K \left(
 !!  \left(\nabla\underline{\varphi}_j\,\underline{\varphi}_k\right)
 !!  \cdot \underline{\varphi}_i -
 !!  \left(\nabla\underline{\varphi}_i\,\underline{\varphi}_k\right)
 !!  \cdot \underline{\varphi}_j \right)dx
 !!  + \int_{\partial K \cap \partial \Omega}
 !!  \left(\underline{\varphi}_k \cdot \underline{n}\right)
 !!  \left(\underline{\varphi}_j \cdot \underline{\varphi}_i\right)
 !!  d\xi \right].
 !! \f}
 !! The two alternative formulations yield the same global problem if
 !! \f$\nabla\cdot\underline{u}=0\f$.
 !!
 !! The modifications required by various upwindings are discussed
 !! later.
 !! \note Not all the upwinding methods are conveniently implemented
 !! on a <em>per element</em> basis; for instance, side based
 !! stabilizations are naturally dealt with within a side loop. This
 !! function handles element based stabilizations while ignoring other
 !! stabilizations, which have to be implemented elsewhere.
 !! \par
 !! \note The recognized upwindings are those defined as
 !! <tt>upw_*</tt> module parameters. Different methods result in the
 !! local matrices being set to <tt>-huge(0.0_wp)</tt>.
 !!
 !! \par Details on the local matrices
 !!
 !! The matrix \f$A^K\f$ is - as expected - symmetric, since
 !! \f{displaymath}{
 !!  \left(\nabla\underline{\varphi}_j +
 !!        (\nabla\underline{\varphi}_j)^T\right) :
 !!  \nabla\underline{\varphi}_i = 
 !!  \left(\nabla\underline{\varphi}_j +
 !!        (\nabla\underline{\varphi}_j)^T\right) :
 !!  \frac{1}{2}
 !!  \left(\nabla\underline{\varphi}_i +
 !!        (\nabla\underline{\varphi}_i)^T\right).
 !! \f}
 !!
 !! In this function, we assume that the velocity FE space is
 !! \f$\underline{V}_h = \left(V_h\right)^d\f$ for some scalar FE
 !! space \f$V_h\f$. Denoting by \f$\varphi_i\f$ the basis functions
 !! of \f$V_h\f$, it can be checked that each pair
 !! \f$(\varphi_i,\varphi_j)\f$ of scalar basis functions generates a
 !! \f$d\times d\f$ block \f$A^{K,d}_{ij}\f$ of \f$A^K\f$ as
 !! \f{displaymath}{
 !!   A^{K,d}_{ij} = \left[ \int_K
 !!   \mu\nabla\varphi_j\cdot\nabla\varphi_i\,dx \right] \mathcal{I}_d
 !!   + \left[ \int_K
 !!   \mu\nabla\varphi_j\otimes\nabla\varphi_i\,dx \right],
 !! \f}
 !! where \f$\mathcal{I}_d\f$ is the \f$d\f$-dimensional identity
 !! (this can be obtained observing that
 !! \f$\underline{\varphi}_i=\underline{e}_{i_d}\varphi_i\f$, with
 !! \f$i_d=1,\ldots,d\f$ and \f$i=1,\ldots,p_k\f$).
 !! Notice that, in conformity with the symmetry of \f$A^K\f$, we have
 !! \f{displaymath}{
 !!   A^{K,d}_{ij} = \left(A^{K,d}_{ji}\right)^T.
 !! \f}
 !! We choose to assemble these blocks using the spatial dimension as
 !! the innermost index, so that
 !! \f{displaymath}{
 !!   A^K = \left[
 !!    \begin{array}{rcll}
 !!     A^{K,d}_{11} & \ldots & A^{K,d}_{1p_k} \\\
 !!     \vdots & \ddots & \vdots \\\
 !!     A^{K,d}_{p_k1} & \ldots & A^{K,d}_{p_kp_k}
 !!    \end{array} \right]
 !! \f}
 !! which corresponds to the following ordering of the unknown
 !! (superscripts for the space dimension, subscripts for the basis
 !! function)
 !! \f{displaymath}{
 !!  \left[
 !!   u^1_1,u^2_1,\ldots,u^d_1,u^1_2,\ldots,u^d_2,
 !!   \ldots,u_{p_k}^1,\ldots,u_{p_k}^d
 !!  \right].
 !! \f}
 !! Coming now to the coordinate transformation from the reference
 !! element to physical space, we have
 !! \f{displaymath}{
 !!   \nabla\varphi = B^{-T}\nabla\hat{\varphi},
 !! \f}
 !! yielding
 !! \f{displaymath}{
 !!   A^{K,d}_{ij} = \left[ det(B)\int_{\hat{K}} \mu
 !!   \nabla\hat{\varphi}_j^TB^{-1}B^{-T}\nabla\hat{\varphi}_i\,d\hat{x}
 !!   \right] \mathcal{I}_d + \left[det(B)B^{-T}\int_{\hat{K}}
 !!   \mu\nabla\hat{\varphi}_j\nabla\hat{\varphi}_i^T\,d\hat{x} \,
 !!   B^{-1} \right],
 !! \f}
 !! which can be further simplified as
 !! \f{displaymath}{
 !!   A^{K,d}_{ij} = \left(det(B)B^{-1}B^{-T}:\hat{\Phi}_{ij}\right)
 !!   \mathcal{I}_d + det(B)B^{-T}\hat{\Phi}_{ij}B^{-1}
 !! \f}
 !! with
 !! \f{displaymath}{
 !!   \hat{\Phi}_{ij} = \int_{\hat{K}}
 !!   \mu\nabla\hat{\varphi}_j\nabla\hat{\varphi}_i^T\,d\hat{x}.
 !! \f}
 !! The matrix \f$B^K\f$ as well has a block structure, where each
 !! scalar function \f$\varphi_j\f$ generates a \f$1\times d\f$ block
 !! \f{displaymath}{
 !!   B^{K,d}_{ij} = -\int_K \nabla\varphi_j^T\,q_i\,dx,
 !! \f}
 !! which can be expressed in terms of the reference element as
 !! \f{displaymath}{
 !!   B^{K,d}_{ij} = -det(B)\int_{\hat{K}}
 !!   \nabla\hat{\varphi}_j^T\, \hat{q}_i\,d\hat{x}\,B^{-1}.
 !! \f}
 !! Finally, \f$D^K(\underline{u})\f$ has a block structure with
 !! \f{displaymath}{
 !!   D^{K,d}_{ij}(\underline{u}) = \sum_k \left[ \underline{u}_k^K
 !!   \cdot \int_K \varphi_k\varphi_i\nabla\varphi_j\,dx \right]
 !!   \mathcal{I}_d
 !! \f}
 !! or, in terms of the reference element,
 !! \f{displaymath}{
 !!   D^{K,d}_{ij}(\underline{u}) = det(B)\sum_k\left[
 !!   \hat{\Psi}_{ijk}^T B^{-1}\underline{u}_k^K \right]
 !!   \mathcal{I}_d
 !! \f}
 !! with
 !! \f{displaymath}{
 !!  \hat{\Psi}_{ijk} = \int_{\hat{K}}
 !!   \hat{\varphi}_i\hat{\varphi}_k\nabla\hat{\varphi}_j\,d\hat{x}.
 !! \f}
 !! \note In the computation of \f$\hat{\Psi}\f$ we can use a higher
 !! accuracy than in the computation of the other terms. This has no
 !! computational overhead, since \f$\hat{\Psi}\f$ can be precomputed.
 !! In fact, we use in this implementation symbolic quadrature, which
 !! ensures high accuracy even for high order elements.
 !!
 !! \par Element based upwinding methods
 !!
 !! Let us now consider the upwinded formulations for the advection
 !! term.
 !!
 !! For the description of the N-scheme upwinding and of the PSI
 !! upwinding, it is convenient to take explicitly into account the
 !! fact that the finite element space is piecewise linear. This
 !! implies the fundamental relation
 !! \f{displaymath}{
 !!  \nabla\varphi_i = \frac{|s_i|}{d|K|} \underline{n}^{in}_i,
 !! \f}
 !! where \f$|s_i|\f$ is the area of the side \f$s_i\f$ opposite to
 !! the \f$i\f$-th vertex and \f$\underline{n}^{in}_i\f$ is the inward
 !! normal on \f$s_i\f$. The local matrix can then be computed as
 !! follows.
 !! <ul>
 !!  <li> <em>N-scheme</em>: the implementation uses the following
 !!  steps:
 !!  <ol>
 !!   <li> compute the average velocity
 !!   \f{displaymath}{
 !!    \overline{\underline{u}}^K = \frac{1}{|K|}\int_K
 !!    \underline{u}\,dx = \frac{1}{d+1}\sum_{j=1}^{d+1}
 !!    \underline{u}_j;
 !!   \f}
 !!   <li> define the \f$d+1\f$ quantities
 !!   \f{displaymath}{
 !!     K_i = \int_K \overline{\underline{u}}^K \cdot \nabla\varphi_i\,
 !!     dx = \frac{1}{d} \overline{\underline{u}}^K \cdot
 !!     |s_i|\underline{n}^{in}_i;
 !!   \f}
 !!   note that the \f$K_i\f$ are indeed the contravariant components
 !!   of \f$\overline{\underline{u}}^K\f$ given the covariant basis
 !!   \f$\frac{|s_i|}{d}\underline{n}^{in}_i\f$ (more precisely, any
 !!   \f$d\f$ of these \f$d+1\f$ vectors), and that they are uniquely
 !!   associated with \f$\overline{\underline{u}}^K\f$; for
 !!   simplicity, we use here subscripts despite the fact that this is
 !!   a covariant basis;
 !!   <li> identify inflow and outflow nodes as
 !!    <ul>
 !!     <li> \f$K_i \geq 0\f$ \f$\mapsto\f$ outflow
 !!     <li> \f$K_i < 0\f$ \f$\mapsto\f$ inflow;
 !!    </ul>
 !!   <li> compute \f$D^K(\underline{u})\f$ with the block structure
 !!   \f{displaymath}{
 !!    D^{K,d}_{ij} = K_i^+\left[ \delta_{ij} -
 !!    (1-\delta_{ij})\frac{K^-_j}{\sum_kK^-_k} \right] \mathcal{I}_d,
 !!   \f}
 !!    notice that this expression yields an M-matrix and is stable
 !!    for \f$\max_i(|K_i|)\to 0\f$.
 !!  </ol>
 !!  <li> <em>PSI</em>: this scheme follows the N-scheme algorithm but
 !!  introduces corrected coefficients \f$\tilde{K}_{i,i_d}\f$, which
 !!  can be thought as derived from a corrected advective velocity.
 !!  Notice the introduction of the additional subscript
 !!  \f$i_d=1,\ldots,d\f$: this is done because the modified
 !!  coefficients will be different for each component of the
 !!  (advected) velocity. To compute the \f$\tilde{K}_{i,i_d}\f$,
 !!  given the \f$K_i\f$ of the N-scheme, we proceed as follows:
 !!  <ol>
 !!   <li> compute
 !!   \f{displaymath}{
 !!    \tilde{\underline{u}}^K = \frac{1}{\sum_j K_j^-} \sum_j K_j^-
 !!    \underline{u}_j;
 !!   \f}
 !!   <li> compute the residuals and total residual
 !!   \f{displaymath}{
 !!    \underline{\rho}_i =
 !!    K_i^+(\underline{u}_i-\tilde{\underline{u}}^K), \qquad
 !!    \underline{\rho} = \sum_{j=1}^{d+1} \underline{\rho}_i;
 !!   \f}
 !!   <li> for all \f$i_d=1,\ldots,d\f$, let \f$y\f$ denote
 !!   \f$u_{i_d}\f$ and proceed as follows:
 !!   <ol>
 !!    <li> define the matrix
 !!    \f{displaymath}{
 !!     N_0 = \left[ \begin{array}{ccc}
 !!      \underline{n}_{k_1} & \ldots & \underline{n}_{k_{p_0}}
 !!     \end{array} \right],
 !!    \f}
 !!    where the \f$p_0\f$ indexes \f$k_1,\ldots,k_{p_0}\f$ identify
 !!    the nodes for which \f$\rho_{ki_d}\rho_{i_d}<0\f$;
 !!    <li> define the velocity correction
 !!    \f{displaymath}{
 !!      \underline{v}_y = \Pi_{\nabla}N_0
 !!      \left(N_0^T\Pi_{\nabla}N_0\right)^{-1}
 !!      N_0^T\,\overline{\underline{u}}^K,
 !!    \f}
 !!    where
 !!    \f{displaymath}{
 !!     \Pi_{\nabla} = \mathcal{I}_d -
 !!     \underline{\tau}\otimes\underline{\tau}, \qquad
 !!     \underline{\tau}=\frac{\nabla y}{\|\nabla y\|},
 !!    \f}
 !!    it is understood that \f$\underline{v}_y=0\f$ if \f$p_0=0\f$;
 !!    <li> compute the coefficients \f$\tilde{K}_{i,i_d}\f$ using the
 !!    modified velocity \f$\overline{\underline{u}}^K -
 !!    \underline{v}_y\f$, and proceed as in the case of the N-scheme
 !!    (notice that, by construction, we will have
 !!    \f$\tilde{K}_{k_1,i_d},\ldots,\tilde{K}_{k_{p_0},i_d} = 0\f$).
 !!   </ol>
 !!  </ol>
 !! </ul>
 !! \note The formula for the velocity correction
 !! \f$\underline{v}_y\f$ is obtained by writing the velocity
 !! correction as linear combination of a unit orthogonal basis of
 !! \f$\nabla y^\perp\f$ and minimizing \f$\|\underline{v}_y\|\f$
 !! under the constraints \f$\underline{v}_y\cdot\underline{n}_{k_s} =
 !! \overline{\underline{u}}^K\cdot\underline{n}_{k_s}\f$ for
 !! \f$s=1,\ldots,p_0\f$. Since \f$\underline{v}_y\f$ has \f$d-1\f$
 !! degrees of freedom, and there are \f$p_0\f$ constraints, \f$0\leq
 !! p_0\leq d-1\f$, the minimization condition plays a role for
 !! \f$d>2\f$, since in such case the constraints might not define
 !! \f$\underline{v}_y\f$ uniquely.
 !!
 !! Another possible element based advection stabilization is obtained
 !! by adding the term
 !! \f{displaymath}{
 !!   R^K_{ij}(\underline{u}) = \tau_{\underline{u}}^K \int_K
 !!   \left( \underline{u}\cdot\nabla\underline{\varphi}_j\right) \cdot
 !!   \left( \underline{u}\cdot\nabla\underline{\varphi}_i \right) \,dx
 !! \f}
 !! where \f$\tau_{\underline{u}}^K = Ch^2_K\f$. After some
 !! computations, it can be seen that such matrix has the block
 !! structure 
 !! \f{displaymath}{
 !!   R^{K,d}_{ij}(\underline{u}) = \tau_{\underline{u}}^K \left[
 !!   \sum_{k,h} \underline{u}_k^K\otimes\underline{u}_h^K \,:\,\int_K
 !!   \varphi_k\varphi_h \nabla\varphi_j\otimes\nabla\varphi_i \, dx
 !!   \right] \mathcal{I}_d
 !! \f}
 !! or, in terms of the reference element,
 !! \f{displaymath}{
 !!   R^{K,d}_{ij}(\underline{u}) = \tau_{\underline{u}}^K det(B)
 !!   \sum_{k,h} \left[ \hat{\Omega}_{khij} :
 !!   \left(B^{-1}\underline{u}_k^K\right) \otimes
 !!   \left(B^{-1}\underline{u}_h^K\right) \right] \mathcal{I}_d
 !! \f}
 !! with
 !! \f{displaymath}{
 !!  \hat{\Omega}_{khij}  = \int_{\hat{K}}
 !!   \hat{\varphi}_k \hat{\varphi}_h \nabla\hat{\varphi}_j \nabla\hat{\varphi}_i^T
 !!   \,d\hat{x}.
 !! \f}
 !! Notice that the computation of \f$\hat{\Omega}\f$ can be
 !! simplified exploiting the two symmetries
 !! \f{displaymath}{
 !!  \hat{\Omega}_{khij} = \hat{\Omega}_{hkij}, \qquad
 !!  \hat{\Omega}_{khij} = \hat{\Omega}_{khji}^T.
 !! \f}
 !!
 !! \par Side based upwinding methods
 !!
 !! Side based stabilizations are not considered here, so that the
 !! computed matrices correspond to the \c upw_none case.
 !!
 !! \todo This function should be adapted to the case of a manifold
 !! grid, when <tt>e\%m.neq.e\%d</tt>. The present implementation will
 !! produce an out-of-bound error in such a case.
 !!
 !! \todo The boundary forces are not implemented.
 !!
 !! \warning The implementation of the nonabsorbing boundary condition
 !! is very preliminary: it should be improved and documented.
 !<
 pure subroutine cg_ns_locmat(aak,bbk,fk,base,pbase,e,be,bcs_err, &
                              eb_p_stab,cck,div_mul,llk,uk)
  real(wp), intent(out) :: aak(:,:), bbk(:,:), fk(:), cck(:,:), llk(:)
  type(t_base), intent(in) :: base, pbase
  type(t_e), intent(in) :: e
  type(p_t_b_e), intent(in) :: be
  type(t_bcs_error), intent(out) :: bcs_err
  logical, intent(in) :: eb_p_stab, div_mul
  real(wp), intent(in), optional :: uk(:)
 
  integer :: j, js, je, i, is, ie, l, h
  real(wp) :: &
    wmu(base%m), f(e%d,base%m), xg(e%m,base%m), detb_bi(e%d,e%m), &
    bit(e%m,e%d), detb_bi_bit(e%d,e%d), phi_phi(e%d,e%d), lap,    &
    wf(e%d), bi_uk(e%d,base%pk), biu_biu(e%d,e%d,base%pk,base%pk),&
    h_k2, tau_k, wbcs(e%d,e%d,base%m)
  ! upwind specific variables
  logical :: psi_active
  integer :: nk_plus, nk_minus
  integer, allocatable :: k_plus(:), k_minus(:)
  real(wp) :: u_bar(e%d), u_tilde(e%d), kk(e%d+1), kk_tilde(e%d+1), &
    npk1d(e%d+1,e%d+1), npk(e%d,e%d+1,e%d+1), rho(e%d,e%d), rho_tot(e%d)
  character(len=*), parameter :: &
    this_sub_name = 'cg_ns_locmat'

   ! Gaussian nodes in physical space
   xg = affmap(e,base%xig)

   ! problem coefficients
   wmu = coeff_visc(xg) * base%wg
   f   = coeff_f(xg)

   do l=1,base%m
   wbcs(:,:,l) = reshape( (/ 0.0_wp , 0.0_wp , 0.0_wp , &
                             0.0_wp , 0.0_wp , 0.0_wp , &
                             0.0_wp , 0.0_wp , 1.0_wp*max( (xg(1,l)-15.0_wp)/5.0_wp , 0.0_wp ) &
                          /) ,(/e%d,e%d/) ) * base%wg(l)
   enddo

   ! precompute map coefficients
   detb_bi     = e%det_b*e%bi
   bit         = transpose(e%bi)
   detb_bi_bit = matmul(detb_bi,bit)

   if(eb_p_stab.or.(upwinding.eq.upw_elem)) h_k2 = diameter(e)**2

   ! advection term
   if(present(uk)) then
     upw_method1: select case(upwinding)

      case(upw_none,upw_side_sc,upw_elem)
       do i=1,base%pk
         is = (i-1)*e%d + 1
         ie =     i*e%d
         bi_uk(:,i) = matmul(detb_bi,uk(is:ie)) ! includes det(B)
       enddo
       if(upwinding.eq.upw_elem) then
         do i=1,base%pk
           do j=1,base%pk
             do h=1,e%d
               do l=1,e%d ! includes det(B)**2
                 biu_biu(h,l,i,j) = bi_uk(h,i) * bi_uk(l,j)
               enddo
             enddo
           enddo
         enddo
       endif

      case(upw_nscheme,upw_psi)

       ! 1) average velocity
       u_bar = nscheme_ubar(e%d,uk)

       ! 2) compute the K_i
       call nscheme_k_coeffs(e,u_bar,kk,nk_plus,k_plus,nk_minus,k_minus)

       flux_if: if(nk_minus.gt.0) then ! if there is flux on the element
         ! We first build the matrix of the N-scheme, and then correct
         ! it if required according to the PSI method.
         call nscheme_locmat(kk,nk_plus,k_plus,nk_minus,k_minus,npk1d)
         do h=1,e%d ! since the scheme is linear, no difference
           npk(h,:,:) = npk1d
         enddo

         ! PSI correction
         psi_if: if(upwinding.eq.upw_psi) then
           ! compute \tilde{u}
           u_tilde = nscheme_utilde(e%d,kk,k_minus,uk)
           ! compute the downwind residuals
           do i=1,nk_plus
             ! get start and end indexes
             is = (k_plus(i)-1)*e%d + 1
             ie =     k_plus(i)*e%d
             rho(:,i) = kk(k_plus(i)) * ( uk(is:ie) - u_tilde )
           enddo
           rho_tot = sum(rho(:,1:nk_plus),2)
           ! now work separately for each velocity component
           component_do: do h=1,e%d
             if(rho_tot(h).ne.0.0) then ! this also implies: grad.ne.0
               call psi_ktilde_coeffs(e%d,nk_plus,k_plus,      &
                 nk_minus,k_minus,kk,uk(h::e%d),rho(h,:), &
                 rho_tot(h),psi_active,kk_tilde)
               activate_psi_if: if(psi_active) then
                 call nscheme_locmat(kk_tilde,nk_plus,k_plus, &
                        nk_minus,k_minus,npk(h,:,:))
               endif activate_psi_if
             else
               npk(h,:,:) = 0.0_wp
             endif
           enddo component_do
         endif psi_if
       
       else
         npk = 0.0_wp ! set to zero the advection contribution
       endif flux_if

      case default
       aak = -huge(0.0_wp) ! one should never get here
       return
     end select upw_method1
   endif

   ! local matrices
   do j=1,base%pk
     ! get start and end indexes
     js = (j-1)*e%d + 1
     je =     j*e%d
     do i=1,base%pk
       ! precompute the integral
       phi_phi = 0.0_wp
       do l=1,base%m
         phi_phi = phi_phi + wmu(l)*phitens(:,:,l,i,j)
       enddo
       ! get start and end indexes
       is = (i-1)*e%d + 1
       ie =     i*e%d
       ! full term
       aak(is:ie,js:je) = matmul(bit,matmul(phi_phi,detb_bi))
       ! diagonal term
       lap = sum( detb_bi_bit * phi_phi )
       if(present(uk)) then ! add advection
         upw_method2: select case(upwinding)

          case(upw_none,upw_side_sc,upw_elem)
           if(asym_adv) then
             do h=1,base%pk
               lap = lap + 0.5_wp*dot_product(bi_uk(:,h),psi(:,h,j,i)-psi(:,h,i,j))
             enddo
           else
             do h=1,base%pk
               lap = lap + dot_product(bi_uk(:,h),psi(:,h,j,i))
             enddo
           endif
           ! for the element based advection stabilization, we have to
           ! include also the stabilization term
           if(upwinding.eq.upw_elem) then
             tau_k = u_stab_coeff/e%det_b*h_k2
             lap = lap + tau_k * sum( omega(:,:,:,:,i,j)*biu_biu )
           endif

          case(upw_nscheme,upw_psi)
           ! the PSI contribution is different for each component: it
           ! can not be accumulated in lap and must be accumulated
           ! directly in aak
           do h=0,e%d-1
             aak(is+h,js+h) = aak(is+h,js+h) + npk(h+1,i,j)
           enddo

         end select upw_method2
       endif
       do h=0,e%d-1
         aak(is+h,js+h) = aak(is+h,js+h) + lap
       enddo
       ! nonabsorbing boundary conditions
       do l=1,base%m
         aak(is:ie,js:je) = aak(is:ie,js:je)                   &
                           + wbcs(:,:,l)*base%p(i,l)*base%p(j,l)
       enddo
     enddo
     do i=1,pbase%pk
       bbk(i,js:je) = matmul(phi_q(:,i,j),detb_bi)
     enddo
   enddo

   ! forcing term
   fk = 0.0_wp
   do l=1,base%m
     wf = base%wg(l) * f(:,l)
     do i=1,base%pk
       ! get start and end indexes
       is = (i-1)*e%d + 1
       ie =     i*e%d
       fk(is:ie) = fk(is:ie) + wf * base%p(i,l)
     enddo
   enddo
   fk = e%det_b*fk

   ! Pressure stabilization
   if(eb_p_stab) then
     tau_k = -p_stab_coeff * h_k2
     do j=1,pbase%pk
       do i=1,j-1 ! j+1,pbase%pk filled by symmetry
         cck(i,j) = tau_k * sum( detb_bi_bit * qtens(:,:,i,j) )
         cck(j,i) = cck(i,j)
       enddo
       cck(j,j) = tau_k * sum( detb_bi_bit * qtens(:,:,j,j) )
     enddo
   else
     cck = 0.0_wp
   endif

   ! Lagrange multiplier for the incompressibility constraint
   if(div_mul) then
     llk = e%det_b * int_q
   endif

   ! add boundary fluxes if present
   bcs_err%lerr = .false.
   if(associated(be%p)) then
     call cg_ns_boundary_flux(aak,fk,base,be,asym_adv,bcs_err,uk)
   endif
   !--------------------------------------------------------------------
 
 end subroutine cg_ns_locmat
 
!-----------------------------------------------------------------------
 
 !> Include boundary terms in the local matrices
 !!
 !! The boundary term resulting from the antisymmetric form of the
 !! advection opertor
 !! \f{displaymath}{
 !!  \frac{1}{2}\sum_k u^K_k 
 !!   \int_{\partial K \cap \partial \Omega}
 !!  \left(\underline{\varphi}_k \cdot \underline{n}\right)
 !!  \left(\underline{\varphi}_j \cdot \underline{\varphi}_i\right)
 !!  d\xi
 !! \f}
 !! has the following block structure
 !! \f{displaymath}{
 !!   \widetilde{D}^{\partial K,d}_{ij}(\underline{u}) = \frac{1}{2}
 !!   \sum_{e\in\partial K\cap\partial\Omega} \sum_k \left[
 !!   \underline{u}_k^K \cdot \underline{n} \int_e
 !!   \varphi_k\varphi_j\varphi_i\,d\xi \right] \mathcal{I}_d.
 !! \f}
 !!
 !! \warning Currently only the boundary terms resulting from the
 !! antisymmetric advection form are implemented.
 !! \todo Implement the boundary terms.
 !<
 pure subroutine cg_ns_boundary_flux(aak,fk,base,be,asym_adv,err,uk)
  real(wp), intent(inout) :: aak(:,:), fk(:)
  type(t_base), intent(in) :: base!, pbase
  type(p_t_b_e), intent(in) :: be
  logical, intent(in) :: asym_adv
  type(t_bcs_error), intent(out) :: err
  real(wp), intent(in), optional :: uk(:)

  integer :: isb, isl, i, is, ie, j, js, k
  real(wp) :: aun(base%pk), d_b
  character(len=*), parameter :: &
    this_sub_name = 'cg_ns_boundary_flux'
 
   err%lerr = .false.
   do isb=1,be%p%nbs ! loop on the boundary sides
     btype: select case(be%p%bs(isb)%p%bc)

      case(b_dir,b_ddc)
       ! nothing to do in this subroutine

      case(b_neu)
       
       if(present(uk).and.asym_adv) then
         isl = be%p%bs(isb)%p%s%isl(1) ! local index of the side
         do k=1,base%pk ! compute the    uk \cdot n
           is = (k-1)*be%p%e%d + 1
           ie =     k*be%p%e%d
           aun(k) = dot_product(uk(is:ie),be%p%e%n(:,isl))
         enddo
         aun = (0.5_wp*be%p%e%s(isl)%p%a/base%me%voldm1) * aun
         do j=1,base%pk
           js = (j-1)*be%p%e%d + 1
           do i=1,base%pk
             is = (i-1)*base%me%d + 1
             d_b = sum( aun * psib(:,j,i,isl) )
             do k=0,be%p%e%d-1
               aak(is+k,js+k) = aak(is+k,js+k) + d_b
             enddo
           enddo
         enddo
       endif

!       isl = be%p%bs(isb)%p%s%isl(1) ! local index of the side
!       ! Gaussian nodes in physical space
!       xgs = affmap(be%p%e,base%xigb(:,:,isl))
!       ! side weights reordered and scaled by the side area
!       wgsa = (be%p%e%s(isl)%p%a/base%me%voldm1) * &
!                   base%wgs(base%stab(be%p%e%ip(isl),:))
!
!       ! problem coefficients
!       breg = -be%p%bs(isb)%p%s%ie(2) ! boundary region
!       gamma = coeff_rob(xgs,breg)
!       hb    = coeff_neu(xgs,breg)
!
!       btypespec: select case(be%p%bs(isb)%p%btype)
!
!        case(1) ! Neumann, Hughes flux
!
!         ! advection coefficient
!         a  = coeff_adv(xgs)
!         an = matmul(transpose(a),be%p%e%s(isl)%p%n)
!         ap = 0.5_wp*(an + abs(an)) + gamma
!
!         do l=1,base%ms
!           do i=1,base%pk
!             do j=1,base%pk
!               aak(i,j) = aak(i,j) + wgsa(l) * &
!                 base%pb(j,l,isl) * ap(l) * base%pb(i,l,isl)
!             enddo
!             fk(i) = fk(i) - wgsa(l) * hb(l) * base%pb(i,l,isl)
!           enddo
!         enddo
!         
!        case default
!         err%lerr = .true.
!         write(err%message,'(A,I5,A,I5,A,I5,A)') &
!           'Unknown boundary condition type "',be%p%bs(isb)%p%btype, &
!           '" on side ',be%p%bs(isb)%p%s%i,', on element ',be%p%e%i,'.'
!       end select btypespec
      case default
       err%lerr = .true.
       write(err%message,'(A,A,A,I5,A,I5,A,I5,A)') &
         'In ',this_sub_name,': unknown boundary condition "', &
         be%p%bs(isb)%p%bc,'" on side ',be%p%bs(isb)%p%s%i,    &
         ', on element ',be%p%e%i,'.'
     end select btype
   enddo

 end subroutine cg_ns_boundary_flux

!-----------------------------------------------------------------------
 
 !> Pressure stabilization.
 !!
 !! This subroutine computes the local stabilization matrix for a
 !! pressure stabilization of the form
 !! \f{displaymath}{
 !!  s(p_h,q_h) = -\sum_{e\in\mathcal{E}_h} \tau^e \int_e
 !!   [\hspace*{-1.6pt}[\nabla p_h]\hspace*{-1.6pt}]
 !!   [\hspace*{-1.6pt}[\nabla q_h]\hspace*{-1.6pt}]\, d\xi,
 !! \f}
 !! where the normal jump is defined as
 !! \f{displaymath}{
 !!   [\hspace*{-1.6pt}[\nabla q_h]\hspace*{-1.6pt}] =
 !!    (\nabla q)^1\cdot\underline{n}^1 +
 !!    (\nabla q)^2\cdot\underline{n}^2
 !! \f}
 !! and the stabilization parameter is
 !! \f{displaymath}{
 !!   \tau^e = C h_e^3
 !! \f}
 !! for some positive constant \f$C\f$. The local matrix is
 !! associated with a side \f$e\in\mathcal{E}_h\f$ and is given as
 !! \f{displaymath}{
 !!  S^e_{IJ} = -\tau^e\int_e
 !!   [\hspace*{-1.6pt}[\nabla q_I]\hspace*{-1.6pt}]
 !!   [\hspace*{-1.6pt}[\nabla q_J]\hspace*{-1.6pt}]\, d\xi,
 !! \f}
 !! where we use capitalized subscripts because they refer to the
 !! generalized element \f$K^1_e\cup K^2_e\f$. In fact, in \f$S^e\f$
 !! there is a full coupling between the shape functions of the two
 !! elements \f$K^1_e\f$ and \f$K^2_e\f$ connected to \f$e\f$ (meaning
 !! that each function of any of the two elements is coupled with all
 !! the functions of both elements).
 !!
 !! We split the computation of the local matrix in two steps:
 !! <ul>
 !!  <li> assume that the \f$q_i\f$ are discontinuous and nonzero only
 !!  in one element
 !!  <li> take into account the continuity of the \f$q_i\f$.
 !! </ul>
 !! In the first step we compute the matrix
 !! \f{displaymath}{
 !!  \tilde{S}^e_{(i_Ki)(j_Kj)} = -\tau^e\int_e
 !!   \left( (\nabla q_{(i_Ki)})\cdot\underline{n}^{i_K} \right)
 !!   \left( (\nabla q_{(j_Kj)})\cdot\underline{n}^{j_K} \right)\, d\xi,
 !! \f}
 !! where \f$(i_Ki)\f$ is a multi-index composed by an element index
 !! \f$i_K=1,2\f$ and a basis function index \f$i=1,\ldots,p_k\f$.
 !! Notice that \f$\tilde{S}^e\f$ has dimension \f$2p_k\times2p_k\f$.
 !!
 !! In the second step, we use a correspondence
 !! \f{displaymath}{
 !!  (i_Ki) \mapsto I(i_Ki)
 !! \f}
 !! and we assemble the contributions by a double loop on \f$(i_Ki)\f$
 !! and \f$(j_Kj)\f$ setting
 !! \f{displaymath}{
 !!  S^e_{I(i_Ki)J(j_Kj)} +\hspace*{-3pt}= \tilde{S}^e_{(i_Ki)(j_Kj)}.
 !! \f}
 !! For nodes which do not belong to the side \f$e\f$ there is only
 !! one multi-index which is mapped on the capitalized one, while for
 !! nodes placed on \f$e\f$ there are two. Therefore, each
 !! \f$S^e_{IJ}\f$ will receive one contribution, if neither \f$I\f$
 !! nor \f$J\f$ are placed on the side, two contributions if one index
 !! is placed on the side and four contributions if both indexes are
 !! placed on the side.
 !!
 !! We now discuss in more details the computations required for the
 !! first step, i.e. the evaluation of \f$\tilde{S}^e\f$. Essentially,
 !! this is the integral of a scalar function, which is easily
 !! transformed on the reference side \f$\hat{e}\f$ by a
 !! multiplication by the map Jacobian. Less obvious is however
 !! evaluating the integrand
 !! \f{displaymath}{
 !!   (\nabla q_{(i_Ki)})\cdot\underline{n}^{i_K}.
 !! \f}
 !! Since both the gradient and the normal transform from reference to
 !! physical space as 1-forms (up to a normalization constant for
 !! \f$\underline{n}\f$), i.e. as
 !! \f{displaymath}{
 !!   \nabla q = B^{-T}\nabla \hat{q}, \qquad
 !!   \underline{n} =
 !!   \frac{1}{\sqrt{\underline{\hat{n}}^TB^{-1}
 !!   B^{-T}\underline{\hat{n}}}} B^{-T}\underline{\hat{n}},
 !! \f}
 !! the scalar product in physical space (that is, the scalar product
 !! appearing in \f$S^e\f$) is
 !! \f{displaymath}{
 !!   \nabla q\cdot \underline{n} =
 !!   \frac{1}{\sqrt{\underline{\hat{n}}^TB^{-1}
 !!   B^{-T}\underline{\hat{n}}}}
 !!   \nabla\hat{q}^TB^{-1}B^{-T}\underline{\hat{n}}.
 !! \f}
 !! Notice that it is not possible to express \f$\partial_nq = \nabla
 !! q\cdot\underline{n}\f$ as a function of
 !! \f$\partial_{\hat{n}}\hat{q} = \nabla
 !! \hat{q}\cdot\underline{\hat{n}}\f$; this is in fact a consequence
 !! of taking a scalar products between 1-forms and not a duality
 !! between a 1-form and a vector. That said, since in the
 !! implementation the physical normal is already available, the
 !! simplest way to evaluate the integrand is
 !! \f{displaymath}{
 !!   \nabla q\cdot \underline{n} =
 !!   \nabla\hat{q}^TB^{-1}\underline{n}.
 !! \f}
 !!
 !! Finally, concerning the second step, it is best done in the
 !! calling subroutine, where the global indexes are known. By doing
 !! this, the map \f$I(i_Ki)\f$ is provided directly by the
 !! correspondence between global and local degrees of freedom, and
 !! the summation code will look like the following (letting \c tssk
 !! denote \f$\tilde{S}^e\f$):
 !! \code
 !! ! Second step: compute S
 !! do jk=1,2
 !!   do j=1,pbase%pk
 !!     jj = (jk-1)*pbase%pk + j ! index in tssk
 !!     jjj = jjj(jk,j)          ! global index
 !!     do ik=1,2
 !!       do i=1,pbase%pk
 !!         ii = (ik-1)*pbase%pk + i ! index in tssk
 !!         iii = iii(ik,i)          ! global index
 !!         S(iii,jjj) = S(iii,jjj) + tssk(ii,jj)
 !!       enddo
 !!     enddo
 !!   enddo
 !! enddo
 !! \endcode
 !!
 !! \warning This function can be called only for internal sides; in
 !! fact there is no need to add any stabilization on boundary sides.
 !! For domain decomposition sides this function can still be used
 !! provided that a suitable ghost element is defined.
 !<
 pure subroutine cg_ns_locpstab(tssk,pbase,s,bcs_err)
  real(wp), intent(out) :: tssk(:,:) !< (2*pbase\%pk,2*pbase\%pk)
  type(t_base), intent(in) :: pbase
  type(t_s), intent(in) :: s
  type(t_bcs_error), intent(out) :: bcs_err
 
  integer :: ik, jk, ll(pbase%ms), i, ii, j, jj, l, isl(2)
  real(wp) :: tau_e, h
  real(wp) :: &
    bi_n(pbase%me%d,2), wgsa(pbase%ms), ngrad(pbase%pk,2,pbase%ms)
  character(len=*), parameter :: &
    this_sub_name = 'cg_ns_locpstab'

   ! Check that we are not working on a boundary side
   bcs_err%lerr = .false.
   if(.not.associated(s%e(2)%p)) then
     bcs_err%lerr = .true.
     write(bcs_err%message,'(A,I5,A)') &
      'Can not compute the pressure stabilization for side ', &
      s%i,', this side seems to be a boundary side.'
     return
   endif

   ! precompute some geometric quantities
   isl = s%isl
   do ik=1,2
     bi_n(:,ik) = matmul( s%e(ik)%p%bi , s%e(ik)%p%n(:,isl(ik)) )
   enddo
   ! we include the coefficient tau^e in the quadrature weights
   h = diameter(s)
   tau_e = -p_stab_coeff * h**3
   wgsa = tau_e * (s%a/pbase%me%voldm1) * pbase%wgs
   ! precompute the integrand functions: ngrad (normal gradients)
   do ik=1,2
     ll = pbase%stab(s%e(ik)%p%pi(isl(ik)),:)
     do i=1,pbase%pk
       do l=1,pbase%ms
         ngrad(i,ik,l) = dot_product(                   &
           pbase%gradpb(:,i,ll(l),isl(ik)) , bi_n(:,ik) )
       enddo
     enddo
   enddo

   ! First step: compute \tilde{S}
   tssk = 0.0_wp
   do l=1,pbase%ms
     do jk=1,2
       do j=1,pbase%pk
         jj = (jk-1)*pbase%pk + j
         do ik=1,2
           do i=1,pbase%pk
             ii = (ik-1)*pbase%pk + i
             tssk(ii,jj) = tssk(ii,jj) +             &
               wgsa(l) * ngrad(j,jk,l) * ngrad(i,ik,l)
           enddo
         enddo
       enddo
     enddo
   enddo
 
 end subroutine cg_ns_locpstab
 
!-----------------------------------------------------------------------

 !> Side based advection stabilization.
 !!
 !! This subroutine computes the local stabilization matrix for an
 !! advection stabilization of the form
 !! \f{displaymath}{
 !!  s_{\underline{w}}(\underline{u}_h,\underline{v}_h) =
 !!  \sum_{e\in\mathcal{E}_h} \tau^e \int_e
 !!  [\hspace*{-1.6pt}[\hspace*{-1.6pt}[
 !!  \underline{w}\cdot\nabla\underline{u}_h
 !!  ]\hspace*{-1.6pt}]\hspace*{-1.6pt}] :
 !!  [\hspace*{-1.6pt}[\hspace*{-1.6pt}[
 !!  \underline{w}\cdot\nabla\underline{v}_h
 !!  ]\hspace*{-1.6pt}]\hspace*{-1.6pt}]\, d\xi,
 !! \f}
 !! where \f$\underline{w}\f$ is a single valued vector field on the
 !! side \f$e\f$, the stabilization parameter is
 !! \f{displaymath}{
 !!   \tau^e = C h_e^3
 !! \f}
 !! for some positive constant \f$C\f$ and the complete jump is
 !! defined, for a generic vector \f$\underline{q}\f$, as
 !! \f{displaymath}{
 !!  [\hspace*{-1.6pt}[\hspace*{-1.6pt}[
 !!   \underline{q}
 !!  ]\hspace*{-1.6pt}]\hspace*{-1.6pt}] = 
 !!    \underline{q}^1\otimes\underline{n}^1 +
 !!    \underline{q}^2\otimes\underline{n}^2
 !! \f}
 !! \note While the normal jump \f$ [\hspace*{-1.6pt}[ \underline{q}
 !! ]\hspace*{-1.6pt}] \f$ measures the jump in the sole normal
 !! component of \f$\underline{q}\f$, the complete jump considers the
 !! jumps in both the normal and the tangential components of the
 !! vector.
 !!
 !! Most of the comments that can be
 !! found in <tt>cg_ns_locpstab</tt> apply to this function as well,
 !! with the following main differences.
 !! <ul>
 !!  <li> In the pressure stabilization, \f$\nabla p_h\f$ is known to
 !!  have continuous tangential component, so that the normal jump can
 !!  be used, while the advective derivative has discontinuities in
 !!  both components which must be taken into account.
 !!  <li> The integrand function includes the vector
 !!  \f$\underline{w}\f$, which must be provided in terms of its
 !!  values at the side quadrature points. Notice that this is
 !!  different from what is done in other functions of this module,
 !!  where the advective velocity is represented in terms of its
 !!  coefficients with respect to the velocity base. However, this is
 !!  a reasonable choice here since there is no velocity base directly
 !!  associated with the sides of the mesh.
 !!  <li> Each pair \f$(\varphi_i,\varphi_j)\f$ of scalar basis
 !!  functions produces a \f$d\times d\f$ block (see later for further
 !!  details).
 !! </ul>
 !!
 !! \warning As for \c cg_ns_locpstab, this function can be called
 !! only for internal sides.
 !!
 !! \par Implementation details
 !!
 !! We proceed as for <tt>cg_ns_locpstab</tt>, using test and shape
 !! vector functions of the form \f$\varphi_i\underline{e}_{i_d}\f$.
 !! The local matrix then has the block structure
 !! \f{displaymath}{
 !!  \tilde{S}^{e,d}_{(i_Ki)(j_Kj)} = \tau^e \left[ \int_e
 !!   \left( \underline{n}^{i_K}\cdot\underline{n}^{j_K} \right)
 !!   \left( \underline{w}\cdot\nabla\varphi_{(i_Ki)} \right)
 !!   \left( \underline{w}\cdot\nabla\varphi_{(j_Kj)} \right)\, d\xi
 !!  \right]\mathcal{I}_d.
 !! \f}
 !! Notice that \f$\tilde{S}^e\f$ has dimension
 !! \f$2p_kd\times2p_kd\f$.
 !!
 !! In practice, this function computes the \f$2p_k\times2p_k\f$
 !! matrix
 !! \f{displaymath}{
 !!  \tilde{S}^{e,1}_{(i_Ki)(j_Kj)} = \tau^e \int_e
 !!   \left( \underline{n}^{i_K}\cdot\underline{n}^{j_K} \right)
 !!   \left( \underline{w}\cdot\nabla\varphi_{(i_Ki)} \right)
 !!   \left( \underline{w}\cdot\nabla\varphi_{(j_Kj)} \right)\, d\xi,
 !! \f}
 !! representing the stabilization term for a scalar variable. The
 !! repetition of the stabilization terms for all the velocity
 !! components is left for the calling function, since it is best
 !! dealt with during the summation step.
 !<
 pure subroutine cg_ns_locustab(tssk,base,s,w,bcs_err)
  real(wp), intent(out) :: tssk(:,:) !< (2*base\%pk,2*base\%pk)
  type(t_base), intent(in) :: base
  type(t_s), intent(in) :: s
  !> The single valued vector \c w is refered to the <em>side</em>
  !! ordering of the quadrature nodes and has dimension
  !! (base\%me\%d,base\%ms).
  !<
  real(wp), intent(in) :: w(:,:)
  type(t_bcs_error), intent(out) :: bcs_err
 
  integer :: ik, jk, ll(base%ms), i, ii, j, jj, l, isl(2)
  real(wp) :: tau_e, h
  real(wp) :: &
    bi_w(base%me%d,base%ms,2), wgsa(base%ms), wgrad(base%pk,2,base%ms)
  character(len=*), parameter :: &
    this_sub_name = 'cg_ns_locustab'
  ! the parameter nn gives the sign of n^{ik} \cdot n^{jk}
  real(wp), parameter :: nn(2,2) = reshape(             &
    (/ 1.0_wp , -1.0_wp , -1.0_wp , 1.0_wp /) , (/2,2/) )

   ! Check that we are not working on a boundary side
   bcs_err%lerr = .false.
   if(.not.associated(s%e(2)%p)) then
     bcs_err%lerr = .true.
     write(bcs_err%message,'(A,I5,A)') &
      'Can not compute the advection stabilization for side ', &
      s%i,', this side seems to be a boundary side.'
     return
   endif

   ! precompute some geometric quantities
   bi_w(:,:,1) = matmul( s%e(1)%p%bi , w )
   bi_w(:,:,2) = matmul( s%e(2)%p%bi , w )
   isl = s%isl
   ! we include the coefficient tau^e in the quadrature weights
   h = diameter(s)
   tau_e = u_stab_coeff * h**3
   wgsa = tau_e * (s%a/base%me%voldm1) * base%wgs
   ! precompute the integrand functions: wgrad (w-gradients)
   do ik=1,2
     ll = base%stab(s%e(ik)%p%pi(isl(ik)),:)
     do i=1,base%pk
       do l=1,base%ms
         wgrad(i,ik,l) = dot_product(                    &
           base%gradpb(:,i,ll(l),isl(ik)) , bi_w(:,l,ik) )
       enddo
     enddo
   enddo

   ! First step: compute \tilde{S}
   tssk = 0.0_wp
   do l=1,base%ms
     do jk=1,2
       do j=1,base%pk
         jj = (jk-1)*base%pk + j
         do ik=1,2
           do i=1,base%pk
             ii = (ik-1)*base%pk + i
             tssk(ii,jj) = tssk(ii,jj) + nn(ik,jk) * &
               wgsa(l) * wgrad(j,jk,l) * wgrad(i,ik,l)
           enddo
         enddo
       enddo
     enddo
   enddo
 
 end subroutine cg_ns_locustab
 
!-----------------------------------------------------------------------
 
 !> Computation of the local (velocity) mass matrix.
 !!
 !! The local matrix is
 !! \f{displaymath}{
 !!  M^K_{ij} = \int_K
 !!  \underline{\varphi}_j \cdot \underline{\varphi}_i \,dx.
 !! \f}
 !! The local mass matrix has a block structure where each block is
 !! given by
 !! \f{displaymath}{
 !!  M^{K,d}_{ij} = \left[\int_K \varphi_j \varphi_i \,dx\right]
 !!  \mathcal{I}_d.
 !! \f}
 !! In terms of the reference element, we readily obtain
 !! \f{displaymath}{
 !!  M^{K,d}_{ij} = \left[det(B)\, \hat{\Xi}_{ij}\right]
 !!  \mathcal{I}_d
 !! \f}
 !! where
 !! \f{displaymath}{
 !!  \hat{\Xi}_{ij} = \int_{\hat{K}} \hat{\varphi}_j \hat{\varphi}_i
 !!  \,d\hat{x}.
 !! \f}
 !<
 pure subroutine cg_ns_locmassmat(mmk,base,e)
  real(wp), intent(out) :: mmk(:,:)
  type(t_base), intent(in) :: base
  type(t_e), intent(in) :: e
 
  integer :: i, j, k, is, js
  character(len=*), parameter :: &
    this_sub_name = 'cg_ns_locmassmat'

   mmk = 0.0_wp
   do j=1,base%pk
     js = (j-1)*base%me%d ! j shift
     do i=1,base%pk
       is = (i-1)*base%me%d ! i shift
       do k=1,base%me%d
         mmk(is+k,js+k) = e%det_b * csi(i,j)
       enddo
     enddo
   enddo
 
 end subroutine cg_ns_locmassmat
 
!-----------------------------------------------------------------------
 
 !> Computation of the right hand side with explicit advection (see \c
 !! cg_ns_locmat for the implicit case).
 !!
 !! When using centered advection, the rhs vector is
 !! \f{displaymath}{
 !!  \begin{array}{rcl}\displaystyle
 !!  f^K_{i} &=&\displaystyle
 !!  \int_K \left( \underline{b} +\sigma
 !!  (\underline{f}-\underline{u}\cdot\nabla\underline{u})\right)
 !!   \cdot \underline{\varphi}_i \,dx \\\
 !!   &= &\displaystyle
 !!   \sum_j b_j^K \int_K \underline{\varphi}_j\cdot
 !!     \underline{\varphi}_i \,dx + \sigma\int_K \underline{f} 
 !!   \cdot \underline{\varphi}_i \,dx -\sigma\sum_{jk}
 !!   u_j^Ku_k^K \int_K \left(\nabla\underline{\varphi}_j
 !!   \underline{\varphi}_k \right)
 !!   \cdot \underline{\varphi}_i \,dx.
 !!  \end{array}
 !! \f}
 !! This vector has thus a block structure where each block is
 !! given by
 !! \f{displaymath}{
 !!  f^{K,d}_{i} = det(B)\sum_j \hat{\Xi}_{ij}\,\underline{b}_j^K
 !!   + \sigma\int_K \underline{f}\varphi_i\,dx
 !!   - \sigma\, det(B)\sum_{jk}\left[
 !!   \hat{\Psi}_{ijk}^T B^{-1}\underline{u}_k^K \right]\,
 !!   \underline{u}_j^K.
 !! \f}
 !!
 !! When using N-scheme/PSI upwinding, the nonlinear residuals are
 !! computed according to the N-scheme and then, if using the PSI
 !! method, limited accordingly. These two schemes are better
 !! described with a formalism which is different from the one used
 !! for the centered scheme and which exploits the fact that the
 !! velocity is piecewise linear. In more details, the two schemes are
 !! defined as follows.
 !! <ul>
 !!  <li> <em>N-scheme</em>: the implementation uses the following
 !!  steps:
 !!  <ol>
 !!   <li> compute the average velocity
 !!   \f{displaymath}{
 !!    \overline{\underline{u}}^K = \frac{1}{|K|}\int_K
 !!    \underline{u}\,dx = \frac{1}{d+1}\sum_{j=1}^{d+1}
 !!    \underline{u}_j;
 !!   \f}
 !!   <li> define the \f$d+1\f$ quantities
 !!   \f{displaymath}{
 !!     K_i = \int_K \overline{\underline{u}}^K \cdot \nabla\varphi_i\,
 !!     dx = \frac{1}{d} \overline{\underline{u}}^K \cdot
 !!     |s_i|\underline{n}^{in}_i,
 !!   \f}
 !!   where \f$|s_i|\f$ is the area of the side \f$s_i\f$ opposite to
 !!   the \f$i\f$-th vertex and \f$\underline{n}^{in}_i\f$ is the
 !!   inward normal on \f$s_i\f$;
 !!   <li> identify inflow and outflow nodes as
 !!    <ul>
 !!     <li> \f$K_i \geq 0\f$ \f$\mapsto\f$ outflow
 !!     <li> \f$K_i < 0\f$ \f$\mapsto\f$ inflow;
 !!    </ul>
 !!   <li> compute
 !!   \f{displaymath}{
 !!    \tilde{\underline{u}}^K = \frac{1}{\sum_j K_j^-} \sum_j K_j^-
 !!    \underline{u}_j;
 !!   \f}
 !!   <li> compute the residuals
 !!   \f{displaymath}{
 !!    \underline{\rho}_i =
 !!    K_i^+(\underline{u}_i-\tilde{\underline{u}}^K);
 !!   \f}
 !!   <li> compute the right hand side
 !!   \f{displaymath}{
 !!    f^{K,d}_{i} = det(B)\sum_j \hat{\Xi}_{ij}\,\underline{b}_j^K
 !!   + \sigma\int_K \underline{f}\varphi_i\,dx
 !!   - \sigma\, \underline{\rho}_i.
 !!   \f}
 !!  </ol>
 !!  <li> <em>PSI</em>: this scheme follows the N-scheme algorithm
 !!  until point 5, and then adds the following limiting procedure:
 !!  <ol>
 !!   <li> compute the total residual
 !!   \f{displaymath}{
 !!    \underline{\rho} = \sum_{j=1}^{d+1} \underline{\rho}_i;
 !!   \f}
 !!   <li> set \f$\underline{\beta}=0\f$; for all
 !!   \f$i=1,\ldots,d+1\f$, for all \f$k=1,\ldots,d\f$
 !!    <ul>
 !!     <li> if \f$ \rho_{ik} \rho_k \geq 0\f$ then set
 !!     \f$\tilde{\rho}_{ik} = \rho_{ik}\f$
 !!     <li> else set \f$\tilde{\rho}_{ik}=0\f$ and \f$\beta_k =
 !!     \beta_k - \rho_{ik}\f$;
 !!    </ul>
 !!   <li> compute the right hand side as in step 6 of the N-scheme
 !!   using the corrected residuals
 !!   \f$\frac{\underline{\rho}}{\underline{\rho}+\underline{\beta}}
 !!   \tilde{\underline{\rho}}_i\f$.
 !!  </ol>
 !! </ul>
 !<
 pure subroutine cg_ns_locrhs(fk,base,e,sigma,bk,uk)
  type(t_base), intent(in) :: base
  type(t_e),    intent(in) :: e
  real(wp),     intent(in) :: sigma
  real(wp),     intent(in) :: bk(:), uk(:)
  real(wp),    intent(out) :: fk(:)

  integer :: l, i, is, ie, j, js, je, k
  real(wp) :: &
    f(base%me%d,base%m), xg(base%me%d,base%m), wf(base%me%d), &
    bi_uk(base%me%d,base%pk), utbipsi
  ! upwind specific variables
  integer :: nk_plus, nk_minus
  integer, allocatable :: k_plus(:), k_minus(:)
  real(wp) :: u_bar(e%d), u_tilde(e%d), rho(e%d*(e%d+1)), kk(e%d+1)
  real(wp) :: rho_tot(e%d), rhod(e%d+1), beta, beta_coeff


   ! Gaussian nodes in physical space
   xg = affmap(e,base%xig)

   ! problem coefficients
   f = coeff_f(xg)

   ! initialization
   fk = 0.0_wp

   ! forcing term
   do l=1,base%m
     wf = base%wg(l) * f(:,l)
     do i=1,base%pk
       ! get start and end indexes
       is = (i-1)*base%me%d + 1
       ie =     i*base%me%d
       fk(is:ie) = fk(is:ie) + wf * base%p(i,l)
     enddo
   enddo
   ! determinant and time step added later

   ! advection
   upw_method: select case(upwinding)

    case(upw_none)
     do i=1,base%pk
       ! get start and end indexes
       is = (i-1)*base%me%d + 1
       ie =     i*base%me%d
       bi_uk(:,i) = matmul(e%bi,uk(is:ie)) ! does not include det(B)
     enddo
     do i=1,base%pk
       ! get start and end indexes
       is = (i-1)*base%me%d + 1
       ie =     i*base%me%d
       do j=1,base%pk
         ! get start and end indexes
         js = (j-1)*base%me%d + 1
         je =     j*base%me%d
         utbipsi = 0.0_wp
         do k=1,base%pk
           utbipsi = utbipsi + dot_product(bi_uk(:,k),psi(:,k,j,i))
         enddo
         fk(is:ie) = fk(is:ie) - utbipsi * uk(js:je)
       enddo
     enddo

    case(upw_nscheme,upw_psi)

     ! 1) average velocity
     u_bar = nscheme_ubar(e%d,uk)

     ! 2,3) compute the K_i
     call nscheme_k_coeffs(e,u_bar,kk,nk_plus,k_plus,nk_minus,k_minus)
     ! divide the K by det_b as in the centered case
     kk = kk/e%det_b

     flux_if: if(nk_minus.gt.0) then ! if there is flux on the element
       ! 4) compute \tilde{u}
       u_tilde = nscheme_utilde(e%d,kk,k_minus,uk)

       ! 5) compute the residuals
       rho = 0.0_wp
       do i=1,nk_plus
         ! get start and end indexes
         is = (k_plus(i)-1)*e%d + 1
         ie =     k_plus(i)*e%d
         rho(is:ie) = kk(k_plus(i)) * ( uk(is:ie) - u_tilde )
       enddo
       
       psi_if: if(upwinding.eq.upw_psi) then
         ! 1) compute the total flux
         rho_tot = 0.0_wp
         do i=1,e%d+1
           ! get start and end indexes
           is = (i-1)*e%d + 1
           ie =     i*e%d
           rho_tot = rho_tot + rho(is:ie)
         enddo
         ! 2) compute the normalization coefficients and residuals
         do j=1,e%d
           if(rho_tot(j).ne.0.0_wp) then
             beta = 0.0_wp
             do i=1,e%d+1
               rhod(i) = rho( (i-1)*e%d + j )
               if(rho_tot(j)*rhod(i).lt.0.0_wp) then
                 beta = beta - rhod(i)
                 rhod(i) = 0.0_wp
               endif
             enddo
             ! Notice: if we are here rho_tot(j).ne.0.0_wp and the
             ! following is always well defined
             beta_coeff = rho_tot(j)/(rho_tot(j)+beta)
             do i=1,e%d+1
               k = (i-1)*e%d + j
               rho(k) = beta_coeff * rhod(i)
             enddo
           else
             do i=1,e%d+1
               k = (i-1)*e%d + j
               rho(k) = 0.0_wp
             enddo
           endif
         enddo

       endif psi_if

     else
       rho = 0.0_wp
     endif flux_if

     ! 6) add the contribution to the right hand side
     fk = fk - rho

    case default
     fk = huge(0.0_wp) ! one should never get here
   end select upw_method
   ! add the time step
   fk = sigma*fk

   ! mass matrix
   do i=1,base%pk
     ! get start and end indexes
     is = (i-1)*base%me%d + 1
     ie =     i*base%me%d
     do j=1,base%pk
       js = (j-1)*base%me%d + 1
       je =     j*base%me%d
       fk(is:ie) = fk(is:ie) + csi(i,j)*bk(js:je)
     enddo
   enddo
   ! add the determinant
   fk = e%det_b*fk

 end subroutine cg_ns_locrhs
 
!-----------------------------------------------------------------------
 
 !> Simple function to compute \f$\overline{\underline{u}}^K\f$
 !! according to the N-scheme.
 !<
 pure function nscheme_ubar(d,uk) result(u_bar)
  integer, intent(in) :: d
  real(wp), intent(in) :: uk(:)
  real(wp) :: u_bar(d)

  integer :: i,j,k
 
   u_bar = 0.0_wp
   do j=1,d+1 ! vertices
     do i=1,d ! space dimensions
       k = (j-1)*d + i
       u_bar(i) = u_bar(i) + uk(k)
     enddo
   enddo
   u_bar = u_bar/real(d+1,wp)
 
 end function nscheme_ubar
 
!-----------------------------------------------------------------------
 
 !> Simple procedure to compute the coefficients \f$K_i\f$ of the
 !! N-scheme.
 !!
 !! The coefficients are also partitioned into nonnegative and
 !! negative coefficients, indexed by \c k_plus and \c k_minus,
 !! respectively.
 !<
 pure subroutine nscheme_k_coeffs(e,u_bar,kk,                    &
                                  nk_plus,k_plus,nk_minus,k_minus)
  type(t_e), intent(in) :: e
  real(wp), intent(in) :: u_bar(:)
  integer, intent(out) :: nk_plus, nk_minus
  integer, allocatable, intent(out) :: k_plus(:), k_minus(:)
  real(wp), intent(out) :: kk(:)

  integer :: j, kp(e%d+1), km(e%d)
  real(wp) :: u_bar_n(e%d+1)

   nk_plus  = 0
   nk_minus = 0

   u_bar_n = -matmul(u_bar,e%n)

   do j=1,e%d+1 ! vertices
     kk(j) = e%s(j)%p%a/real(e%d,wp) * u_bar_n(j)
     if(kk(j).ge.0.0_wp) then
       nk_plus = nk_plus + 1
       kp(nk_plus) = j
     else
       nk_minus = nk_minus + 1
       km(nk_minus) = j
     endif
   enddo

   allocate(k_plus(nk_plus),k_minus(nk_minus))
   k_plus  = kp(1:nk_plus )
   k_minus = km(1:nk_minus)
 
 end subroutine nscheme_k_coeffs
 
!-----------------------------------------------------------------------
 
 !> Simple function to compute \f$\tilde{\underline{u}}^K\f$ according
 !! to the N-scheme.
 !!
 !! \warning The array \c k_minus must contain only valid indexes, and is
 !! always used as a whole.
 !!
 !! \note The <tt>kk(k_minus)</tt> are strictly negative, so that the
 !! \c k_coeff are always well defined and bounded. If the \c kk are
 !! very small, the \c k_coeff can be very inaccurate, but this is not
 !! a problem since they will be multiplied by \c kk in the following.
 !<
 pure function nscheme_utilde(d,kk,k_minus,uk) result(u_tilde)
  integer, intent(in) :: d, k_minus(:)
  real(wp), intent(in) :: kk(:), uk(:)
  real(wp) :: u_tilde(d)

  real(wp) :: k_coeff(d+1)
 
   k_coeff = 0.0_wp
   k_coeff(k_minus) = kk(k_minus) / sum(kk(k_minus))
   u_tilde = matmul(reshape(uk,(/d,d+1/)),k_coeff)
 end function nscheme_utilde
 
!-----------------------------------------------------------------------
 
 !> Build the local matrix for the N-scheme
 pure subroutine nscheme_locmat(kk,nk_plus,k_plus,nk_minus,k_minus,npk)
  integer, intent(in) :: nk_plus, k_plus(:), nk_minus, k_minus(:)
  real(wp), intent(in) :: kk(:)
  real(wp), intent(out) :: npk(:,:)

   integer :: i, j, ii, jj
   real(wp) :: kk_sum
 
   ! rescale the negative coefficients
   kk_sum = sum(kk(k_minus))
   npk = 0.0_wp
   do i=1,nk_plus ! row
     ii = k_plus(i)
     npk(ii,ii) = 1.0_wp ! (diagonal entry)
     do j=1,nk_minus ! columns (off-diagonal)
       jj = k_minus(j)
       npk(ii,jj) = -kk(jj)/kk_sum
     enddo
     ! rescale the row
     npk(ii,:) = kk(ii)*npk(ii,:)
   enddo
 
 end subroutine nscheme_locmat
 
!-----------------------------------------------------------------------
 
 !> This subroutine computes the corrected coefficients of the PSI
 !! method.
 !!
 !! The flag \c active indicates whether the PSI nonlinear mechanism
 !! is active or not. If <tt>active.eqv..false<tt>, the coefficients
 !! of the N-scheme should be used, and \c kk_tilde is left undefined.
 !! Notice that the local indexes k_plus and k_minus are not affected
 !! by the PSI mechanism, so the values of the N-scheme can be reused.
 !!
 !! \note This subroutine operates on a scalar unknown \c c, which can
 !! be used as a single velocity component.
 !!
 !! \warning The residuals \c rho are assumed to follow the numbering
 !! of the k_plus vertexes, i.e. the inflow vertexes are not counted.
 !<
 pure subroutine psi_ktilde_coeffs(d,nk_plus,k_plus,nk_minus,k_minus, &
                                   kk,c,rho,rho_tot,active,kk_tilde)
  integer, intent(in) :: d, nk_plus, k_plus(:), nk_minus, k_minus(:)
  real(wp), intent(in) :: kk(:), c(:), rho(:), rho_tot
  logical, intent(out) :: active
  real(wp), intent(out) :: kk_tilde(:)

  real(wp), parameter :: toll = 1e5_wp*epsilon(0.0_wp)
  integer :: i, ii, p0, pp, k_plus0(size(k_plus)-1), k_plusp(size(k_plus))
  real(wp) :: mm(d+3,d+3), b(d+3), x(d+3)

   p0 = 0
   pp = 0
   do i=1,nk_plus ! vertices
     ! The following tolerance criterion guarantees that the matrix
     ! mmi will be invertible, but at the same time handles correctly
     ! the case rho_tot -> 0
     if( rho(i)*sign(1.0_wp,rho_tot) .lt. -toll*abs(rho_tot) ) then
       p0 = p0+1
       k_plus0(p0) = i ! store the index
     else
       pp = pp+1
       k_plusp(pp) = i
     endif
   enddo

   if(p0.gt.0) then
     active = .true.

     ! Build the linear system: the unknowns are ordered as
     !  [ K_1 , K_2 , ... , K_{d+1} , \phi_1 , \phi_2 ]
     mm = 0.0_wp; b = 0.0_wp

     ! p0 equations to set to zero the new K
     do i=1,p0
       ii = k_plus(k_plus0(i)) ! vertex index
       mm(ii,ii) = 1.0_wp
     enddo

     ! pp equations for the positive K
     do i=1,pp
       ii = k_plus(k_plusp(i)) ! vertex index
       mm(ii,ii) = 1.0_wp
       mm(ii,d+2) = -kk(ii)
     enddo

     ! nk_minus equations for the negative K
     do i=1,nk_minus
       ii = k_minus(i) ! vertex index
       mm(ii,ii) = 1.0_wp
       mm(ii,d+3) = -kk(ii)
     enddo

     ! one equation for the zero sum condition
     mm(d+2,1:d+1) = 1.0_wp

     ! one equation to impose the orthogonality of the new velocity to
     ! the gradient
     do i=1,d+1
       mm(d+3,i) =        c(i)
        b(d+3) = b(d+3) + c(i) * kk(i)
     enddo

     ! compute the new K
     call linsys(mm,b,x)
     kk_tilde = x(1:d+1)
     
     ! Here one should check whether x(d+2:d+3) > 0. If this is not
     ! true, than something wrong has happened, since a node has
     ! changed from upwind to downwind during the PSI procedure.

   else
     active = .false.
   endif

 end subroutine psi_ktilde_coeffs
 
!-----------------------------------------------------------------------

end module mod_cg_ns_locmat

